{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/Ankur3107/GitHub-Bugs-Prediction-Challenge/blob/main/%20model-template/MachineHack_Distilbert.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 595
    },
    "id": "yclGd-8Ifckh",
    "outputId": "e7e3fe11-2e72-4e67-ac54-c98f7836b936"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting transformers\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)\n",
      "\u001b[K     |████████████████████████████████| 1.1MB 3.3MB/s \n",
      "\u001b[?25hCollecting sentencepiece!=0.1.92\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)\n",
      "\u001b[K     |████████████████████████████████| 1.1MB 15.4MB/s \n",
      "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n",
      "Collecting tokenizers==0.8.1.rc2\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n",
      "\u001b[K     |████████████████████████████████| 3.0MB 19.4MB/s \n",
      "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n",
      "Collecting sacremoses\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n",
      "\u001b[K     |████████████████████████████████| 890kB 45.3MB/s \n",
      "\u001b[?25hRequirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n",
      "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n",
      "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)\n",
      "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)\n",
      "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n",
      "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.16.0)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n",
      "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)\n",
      "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n",
      "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n",
      "Building wheels for collected packages: sacremoses\n",
      "  Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=cd3b259ca8f2a4c4f2f46a7885630d3a45ea29fbe2b2db161544cc2bda387b14\n",
      "  Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n",
      "Successfully built sacremoses\n",
      "Installing collected packages: sentencepiece, tokenizers, sacremoses, transformers\n",
      "Successfully installed sacremoses-0.0.43 sentencepiece-0.1.91 tokenizers-0.8.1rc2 transformers-3.3.1\n"
     ]
    }
   ],
   "source": [
    "!pip install transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 374
    },
    "id": "RMX0exNQeuno",
    "outputId": "80ede5c0-ea30-4ff7-cd60-046009936017"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2020-10-14 12:10:36--  https://machinehack-be.s3.amazonaws.com/predict_github_issues_embold_sponsored_hackathon/Embold_Participant%27s_Dataset.zip\n",
      "Resolving machinehack-be.s3.amazonaws.com (machinehack-be.s3.amazonaws.com)... 52.219.66.124\n",
      "Connecting to machinehack-be.s3.amazonaws.com (machinehack-be.s3.amazonaws.com)|52.219.66.124|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 102320961 (98M) [application/octet-stream]\n",
      "Saving to: ‘Embold_Participant's_Dataset.zip’\n",
      "\n",
      "Embold_Participant' 100%[===================>]  97.58M  12.9MB/s    in 9.2s    \n",
      "\n",
      "2020-10-14 12:10:46 (10.6 MB/s) - ‘Embold_Participant's_Dataset.zip’ saved [102320961/102320961]\n",
      "\n",
      "Archive:  ./Embold_Participant's_Dataset.zip\n",
      "   creating: Dataset/Embold_Participant's_Dataset/\n",
      "  inflating: Dataset/Embold_Participant's_Dataset/sample submission.csv  \n",
      "  inflating: Dataset/__MACOSX/Embold_Participant's_Dataset/._sample submission.csv  \n",
      "  inflating: Dataset/Embold_Participant's_Dataset/embold_train_extra.json  \n",
      "  inflating: Dataset/__MACOSX/Embold_Participant's_Dataset/._embold_train_extra.json  \n",
      "  inflating: Dataset/Embold_Participant's_Dataset/embold_test.json  \n",
      "  inflating: Dataset/__MACOSX/Embold_Participant's_Dataset/._embold_test.json  \n",
      "  inflating: Dataset/Embold_Participant's_Dataset/embold_train.json  \n",
      "  inflating: Dataset/__MACOSX/Embold_Participant's_Dataset/._embold_train.json  \n"
     ]
    }
   ],
   "source": [
    "!wget https://machinehack-be.s3.amazonaws.com/predict_github_issues_embold_sponsored_hackathon/Embold_Participant%27s_Dataset.zip\n",
    "!unzip ./Embold_Participant\\'s_Dataset.zip -d Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "id": "zZre2HKzfJhU",
    "outputId": "f64efa4e-038c-43f7-a92f-01a0d38a5010"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/content/Dataset/Embold_Participant's_Dataset\n"
     ]
    }
   ],
   "source": [
    "cd \"Dataset/Embold_Participant's_Dataset/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "IkgF1XjCfL8s"
   },
   "outputs": [],
   "source": [
    "#import pandas as pd\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "jUArON7dfL6L"
   },
   "outputs": [],
   "source": [
    "train_df = pd.read_json(\"embold_train.json\").reset_index(drop=True)\n",
    "test_df = pd.read_json(\"embold_test.json\").reset_index(drop=True)\n",
    "train_ex_df = pd.read_json(\"embold_train_extra.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "PIWj_QLZdE2w"
   },
   "outputs": [],
   "source": [
    "test_label_df = pd.read_csv('/content/ensemble_v6.csv')\n",
    "test_df['label'] = test_label_df['label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "id": "RVjD4XyZfL3X"
   },
   "outputs": [],
   "source": [
    "train_data = train_df.append(train_ex_df)\n",
    "test_df['text'] = test_df['title']+' '+test_df['body']\n",
    "train_data['text'] = train_data['title']+' '+train_data['body']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "id": "KeiyQYc5fLz-"
   },
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from transformers import *\n",
    "from transformers import pipeline\n",
    "from pprint import pprint\n",
    "from tensorflow import keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 765
    },
    "id": "1LzwPjc4fLv8",
    "outputId": "790d327c-5a70-4de6-b7f1-3adc5540685c"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on TPU  grpc://10.104.51.98:8470\n",
      "INFO:tensorflow:Initializing the TPU system: grpc://10.104.51.98:8470\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Initializing the TPU system: grpc://10.104.51.98:8470\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Clearing out eager caches\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Clearing out eager caches\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Finished initializing TPU system.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Finished initializing TPU system.\n",
      "WARNING:absl:`tf.distribute.experimental.TPUStrategy` is deprecated, please use  the non experimental symbol `tf.distribute.TPUStrategy` instead.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Found TPU system:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Found TPU system:\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Num TPU Cores: 8\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Num TPU Cores: 8\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Num TPU Workers: 1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Num TPU Workers: 1\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "REPLICAS:  8\n"
     ]
    }
   ],
   "source": [
    "@dataclass\n",
    "class Config:\n",
    "    MAX_LEN = 320\n",
    "    BATCH_SIZE = 16  # per TPU core\n",
    "    TOTAL_STEPS = 2000  # thats approx 4 epochs\n",
    "    EVALUATE_EVERY = 200\n",
    "    LR = 1e-5\n",
    "    PRETRAINED_MODEL = \"distilroberta-base\"  # huggingface bert model\n",
    "\n",
    "\n",
    "transformer_flags = Config()\n",
    "\n",
    "\n",
    "def connect_to_TPU():\n",
    "    \"\"\"Detect hardware, return appropriate distribution strategy\"\"\"\n",
    "    try:\n",
    "        # TPU detection. No parameters necessary if TPU_NAME environment variable is\n",
    "        # set: this is always the case on Kaggle.\n",
    "        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
    "        print(\"Running on TPU \", tpu.master())\n",
    "    except ValueError:\n",
    "        tpu = None\n",
    "\n",
    "    if tpu:\n",
    "        tf.config.experimental_connect_to_cluster(tpu)\n",
    "        tf.tpu.experimental.initialize_tpu_system(tpu)\n",
    "        strategy = tf.distribute.experimental.TPUStrategy(tpu)\n",
    "    else:\n",
    "        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.\n",
    "        strategy = tf.distribute.get_strategy()\n",
    "\n",
    "    global_batch_size = transformer_flags.BATCH_SIZE * strategy.num_replicas_in_sync\n",
    "\n",
    "    return tpu, strategy, global_batch_size\n",
    "\n",
    "\n",
    "tpu, strategy, global_batch_size = connect_to_TPU()\n",
    "print(\"REPLICAS: \", strategy.num_replicas_in_sync)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "id": "7HMNtvnJfpJe"
   },
   "outputs": [],
   "source": [
    "def regular_encode(texts, tokenizer, maxlen=512):\n",
    "    enc_di = tokenizer.batch_encode_plus(\n",
    "        texts,\n",
    "        return_attention_mask=False,\n",
    "        return_token_type_ids=False,\n",
    "        pad_to_max_length=True,\n",
    "        max_length=maxlen,\n",
    "        truncation=True,\n",
    "    )\n",
    "\n",
    "    return np.array(enc_di[\"input_ids\"])\n",
    "\n",
    "from transformers import *\n",
    "#tokenizer = AutoTokenizer.from_pretrained(transformer_flags.PRETRAINED_MODEL)\n",
    "#tokenizer = BertTokenizerFast.from_pretrained(transformer_flags.PRETRAINED_MODEL)\n",
    "#tokenizer = AlbertTokenizer.from_pretrained(transformer_flags.PRETRAINED_MODEL, use_fast=True)\n",
    "tokenizer = RobertaTokenizerFast.from_pretrained(transformer_flags.PRETRAINED_MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "id": "ioqdj5xUfrXJ"
   },
   "outputs": [],
   "source": [
    "def build_model():\n",
    "  input_layer = tf.keras.layers.Input((transformer_flags.MAX_LEN,), dtype=tf.int32)\n",
    "  bert_model = TFAutoModel.from_pretrained(transformer_flags.PRETRAINED_MODEL)\n",
    "  #bert_model = TFAutoModel.from_pretrained('/content/github-albert-xlarge-v2/')\n",
    "  #bert_model = TFAutoModel.from_pretrained('../../github-roberta-large/')\n",
    "  output_layer = bert_model(input_layer) \n",
    "  output = tf.keras.layers.Dense(3, activation='softmax')(output_layer[1])\n",
    "  classifier_model = tf.keras.Model(input_layer, output)\n",
    "\n",
    "  optimizer = tf.keras.optimizers.Adam(learning_rate=transformer_flags.LR)\n",
    "  classifier_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n",
    "  classifier_model.summary()\n",
    "    \n",
    "  return classifier_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "id": "TzkBWN_pBh2S"
   },
   "outputs": [],
   "source": [
    "all_train = train_data[0:200000]\n",
    "all_train = all_train.append(train_data[230000:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "id": "xKGMpxrQB7pN"
   },
   "outputs": [],
   "source": [
    "question_df = all_train[all_train.label==2]\n",
    "bug_issue = all_train[all_train.label!=2]\n",
    "bug_issue = bug_issue.sample(150000)\n",
    "train = question_df.append(bug_issue)\n",
    "train = train.append(test_df)\n",
    "train = train.sample(len(train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 85
    },
    "id": "QLLE_OuYBkCL",
    "outputId": "0a972d51-f276-4b25-ec0b-452822b18854"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    90207\n",
       "0    87139\n",
       "2    41979\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 85
    },
    "id": "jBDJJ9Q_CoyL",
    "outputId": "3e2d0c7c-b70f-4e15-fa71-6785f6f27cbe"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    13846\n",
       "0    13278\n",
       "2     2876\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "validate = train_data[200000:230000]\n",
    "validate.label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "id": "TyDywIGVCsv2",
    "outputId": "210a431e-95fd-4c3b-ad9e-e3d1c2edc25c"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((219325, 4), (30000, 4))"
      ]
     },
     "execution_count": 14,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.shape, validate.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 360
    },
    "id": "yf2E7SlSfx4A",
    "outputId": "3f10abc6-1491-447e-d0ca-741ab8aa476a"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at distilroberta-base were not used when initializing TFRobertaModel: ['lm_head']\n",
      "- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
      "- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "All the weights of TFRobertaModel were initialized from the model checkpoint at distilroberta-base.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"functional_1\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_1 (InputLayer)         [(None, 320)]             0         \n",
      "_________________________________________________________________\n",
      "tf_roberta_model (TFRobertaM ((None, 320, 768), (None, 82118400  \n",
      "_________________________________________________________________\n",
      "dense (Dense)                (None, 3)                 2307      \n",
      "=================================================================\n",
      "Total params: 82,120,707\n",
      "Trainable params: 82,120,707\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "with strategy.scope():\n",
    "    classifier_model = build_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 88
    },
    "id": "-1xkTHmufz4V",
    "outputId": "1dca1814-55b7-425b-bf96-c7e98bcc7b0a"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils_base.py:1773: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(219325, 320) (219325,)\n"
     ]
    }
   ],
   "source": [
    "# data preparation\n",
    "X_data = regular_encode(train.text.values.tolist(), tokenizer, maxlen=transformer_flags.MAX_LEN)\n",
    "y_train = train.label.values\n",
    "print(X_data.shape, y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 88
    },
    "id": "tSbrXFxjf2wU",
    "outputId": "7cc10000-2c1e-42df-c68c-91b965ddc4a0"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils_base.py:1773: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(30000, 320) (30000,)\n"
     ]
    }
   ],
   "source": [
    "# data preparation\n",
    "X_val = regular_encode(validate.text.values.tolist(), tokenizer, maxlen=transformer_flags.MAX_LEN)\n",
    "y_val = validate.label.values\n",
    "print(X_val.shape, y_val.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 71
    },
    "id": "qbywnVJtf4r7",
    "outputId": "b456332e-5e0b-41fb-c3be-b51a8ab94b64"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils_base.py:1773: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n"
     ]
    }
   ],
   "source": [
    "X_test = regular_encode(test_df.text.values.tolist(), tokenizer, maxlen=transformer_flags.MAX_LEN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 292
    },
    "id": "PS90P7BKf6q-",
    "outputId": "096c51ff-d18d-42c6-b6f2-80e975d598c5"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/2\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/multi_device_iterator_ops.py:601: get_next_as_optional (from tensorflow.python.data.ops.iterator_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use `tf.data.Iterator.get_next_as_optional()` instead.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/multi_device_iterator_ops.py:601: get_next_as_optional (from tensorflow.python.data.ops.iterator_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use `tf.data.Iterator.get_next_as_optional()` instead.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   2/3427 [..............................] - ETA: 8:45:43 - loss: 1.0172 - accuracy: 0.4453 WARNING:tensorflow:Callbacks method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0052s vs `on_train_batch_end` time: 0.1117s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Callbacks method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0052s vs `on_train_batch_end` time: 0.1117s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3427/3427 [==============================] - ETA: 0s - loss: 0.5160 - accuracy: 0.8018WARNING:tensorflow:Callbacks method `on_test_batch_end` is slow compared to the batch time (batch time: 0.0027s vs `on_test_batch_end` time: 0.0348s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Callbacks method `on_test_batch_end` is slow compared to the batch time (batch time: 0.0027s vs `on_test_batch_end` time: 0.0348s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3427/3427 [==============================] - 441s 129ms/step - loss: 0.5160 - accuracy: 0.8018 - val_loss: 0.4766 - val_accuracy: 0.8222\n",
      "Epoch 2/2\n",
      "3427/3427 [==============================] - 411s 120ms/step - loss: 0.4561 - accuracy: 0.8290 - val_loss: 0.4345 - val_accuracy: 0.8351\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x7fa1216f7c88>"
      ]
     },
     "execution_count": 19,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 0:200000+30000 datapoint\n",
    "classifier_model.fit(X_data, y_train, epochs=2,batch_size=64, validation_data=(X_val, y_val))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 190
    },
    "id": "wu2rQOuRkYA-",
    "outputId": "6e97d359-da22-4154-b929-0ab12e843253"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/2\n",
      "   2/3125 [..............................] - ETA: 6:42 - loss: 0.3951 - accuracy: 0.8281WARNING:tensorflow:Callbacks method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0052s vs `on_train_batch_end` time: 0.0956s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Callbacks method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0052s vs `on_train_batch_end` time: 0.0956s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3125/3125 [==============================] - ETA: 0s - loss: 0.4361 - accuracy: 0.8356WARNING:tensorflow:Callbacks method `on_test_batch_end` is slow compared to the batch time (batch time: 0.0028s vs `on_test_batch_end` time: 0.0344s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Callbacks method `on_test_batch_end` is slow compared to the batch time (batch time: 0.0028s vs `on_test_batch_end` time: 0.0344s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3125/3125 [==============================] - 341s 109ms/step - loss: 0.4361 - accuracy: 0.8356 - val_loss: 0.4267 - val_accuracy: 0.8404\n",
      "Epoch 2/2\n",
      "3125/3125 [==============================] - 342s 109ms/step - loss: 0.4120 - accuracy: 0.8450 - val_loss: 0.4350 - val_accuracy: 0.8392\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x7f2d46ac1d30>"
      ]
     },
     "execution_count": 25,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 230000:430000 datapoint\n",
    "#classifier_model.fit(X_data, y_train, epochs=2,batch_size=64, validation_data=(X_val, y_val))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 139
    },
    "id": "ol4yPSxAf82j",
    "outputId": "90450529-6554-4b94-c546-8de449ef0a7d"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  2/235 [..............................] - ETA: 9:13WARNING:tensorflow:Callbacks method `on_predict_batch_end` is slow compared to the batch time (batch time: 0.0024s vs `on_predict_batch_end` time: 0.0485s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Callbacks method `on_predict_batch_end` is slow compared to the batch time (batch time: 0.0024s vs `on_predict_batch_end` time: 0.0485s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "235/235 [==============================] - 20s 86ms/step\n",
      "  2/235 [..............................] - ETA: 6sWARNING:tensorflow:Callbacks method `on_predict_batch_end` is slow compared to the batch time (batch time: 0.0038s vs `on_predict_batch_end` time: 0.0481s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Callbacks method `on_predict_batch_end` is slow compared to the batch time (batch time: 0.0038s vs `on_predict_batch_end` time: 0.0481s). Check your callbacks.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "235/235 [==============================] - 12s 51ms/step\n"
     ]
    }
   ],
   "source": [
    "y_val_pred = classifier_model.predict(X_val, batch_size=global_batch_size, verbose=1)\n",
    "y_test_pred = classifier_model.predict(X_test, batch_size=global_batch_size, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "id": "-5yFAhzNgA9R"
   },
   "outputs": [],
   "source": [
    "eval_df = pd.DataFrame({'X':y_val_pred.tolist(), 'y':y_val.tolist()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "id": "yAV3kgefgBAD"
   },
   "outputs": [],
   "source": [
    "eval_df.to_csv('distilroberta-base-maxlen320-PL-2epochs-90K87K40K-eval-prob.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "id": "s_t_4uvYgA7K"
   },
   "outputs": [],
   "source": [
    "submission = pd.read_csv('sample submission.csv')\n",
    "submission['label'] = y_test_pred.tolist()\n",
    "file_name = 'distilroberta-base-maxlen320-PL-2epochs-90K87K40K-test-prob.csv'\n",
    "submission.to_csv(file_name, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_VnzD4wygMTm"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "TPU",
  "colab": {
   "authorship_tag": "ABX9TyOm7JtD2B2NEreYH85dShhq",
   "include_colab_link": true,
   "name": "MachineHack Distilbert.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
